import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import missingno as msno
import ipywidgets as widgets
import geopandas as gpd
plt.style.use('seaborn-v0_8')
print("Plotly version:", plotly.__version__)
print('matplotlib: {}'.format(matplotlib.__version__))
Plotly version: 5.9.0 matplotlib: 3.7.1
df_trips = pd.read_csv('citibike-trips.csv')
df_stations = pd.read_csv('citibike-stations.csv')
df_trips.head(3)
| tripduration | starttime | stoptime | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bikeid | usertype | birth_year | gender | customer_plan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2319 | 2016-03-09 13:08:21 | 2016-03-09 13:47:01 | 520 | W 52 St & 5 Ave | 40.759923 | -73.976485 | 363 | West Thames St | 40.708347 | -74.017134 | 23062 | Subscriber | 1972.0 | male | NaN |
| 1 | 313 | 2015-07-09 15:42:30 | 2015-07-09 15:47:44 | 520 | W 52 St & 5 Ave | 40.759923 | -73.976485 | 493 | W 45 St & 6 Ave | 40.756800 | -73.982912 | 16909 | Subscriber | 1968.0 | female | NaN |
| 2 | 798 | 2017-04-20 18:43:59 | 2017-04-20 18:57:17 | 520 | W 52 St & 5 Ave | 40.759923 | -73.976485 | 3258 | W 27 St & 10 Ave | 40.750182 | -74.002184 | 15555 | Subscriber | 1991.0 | male | NaN |
df_trips.dtypes
tripduration int64 starttime object stoptime object start_station_id int64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id int64 end_station_name object end_station_latitude float64 end_station_longitude float64 bikeid int64 usertype object birth_year float64 gender object customer_plan float64 dtype: object
df_trips['starttime'] = pd.to_datetime(df_trips['starttime'],format='%Y-%m-%d %H:%M:%S')
df_trips['Year'] = df_trips['starttime'].dt.strftime('%Y')
df_trips['Year_Month'] = df_trips['starttime'].dt.strftime('%Y-%b')
df_trips['Month'] = df_trips['starttime'].dt.strftime("%B")
df_trips['Day'] = df_trips['starttime'].dt.strftime('%d')
df_trips['Day_Name'] = df_trips['starttime'].dt.strftime("%A")
df_trips['starttime'].dtype
dtype('<M8[ns]')
# missingno library is a useful tool to vizualize the missing values
msno.bar(df_trips)
<Axes: >
df_trips['birth_year'].fillna(value = 0, inplace=True, downcast='int64')
df_trips['birth_year'].isnull().sum()
0
df_trips.drop('customer_plan',axis=1,inplace=True)
df_trips.describe().round(1)
| tripduration | start_station_id | start_station_latitude | start_station_longitude | end_station_id | end_station_latitude | end_station_longitude | bikeid | birth_year | |
|---|---|---|---|---|---|---|---|---|---|
| count | 473556.0 | 473556.0 | 473556.0 | 473556.0 | 473556.0 | 473556.0 | 473556.0 | 473556.0 | 473556.0 |
| mean | 944.2 | 879.4 | 40.7 | -74.0 | 873.1 | 40.7 | -74.0 | 20644.6 | 1743.6 |
| std | 6760.5 | 1058.2 | 0.0 | 0.0 | 1053.0 | 0.1 | 0.1 | 4464.8 | 638.5 |
| min | 60.0 | 72.0 | 40.6 | -74.0 | 72.0 | 0.0 | -74.0 | 14529.0 | 0.0 |
| 25% | 388.0 | 326.0 | 40.7 | -74.0 | 326.0 | 40.7 | -74.0 | 17103.0 | 1964.0 |
| 50% | 633.0 | 446.0 | 40.7 | -74.0 | 445.0 | 40.7 | -74.0 | 19615.5 | 1978.0 |
| 75% | 1069.0 | 521.0 | 40.8 | -74.0 | 520.0 | 40.8 | -74.0 | 23462.0 | 1986.0 |
| max | 1969746.0 | 3649.0 | 40.8 | -73.9 | 3654.0 | 40.8 | 0.0 | 33473.0 | 2001.0 |
def print_outliers_IQR(df):
q1=df.quantile(0)
q3=df.quantile(0.999)
IQR=q3-q1
return print(f"IQR:{IQR}\nq3:{q3}\nq1:{q1}")
print_outliers_IQR(df_trips['tripduration'])
IQR:20222.34500000137 q3:20282.34500000137 q1:60.0
def find_outliers_IQR(df):
q1=df.quantile(0.25)
q3=df.quantile(0.999)
IQR=q3-q1
outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]
return outliers
df_outliers = pd.DataFrame(find_outliers_IQR(df_trips['tripduration']))
df_outliers
| tripduration | |
|---|---|
| 183 | 183705 |
| 6278 | 60392 |
| 7904 | 187312 |
| 10147 | 257031 |
| 19019 | 64467 |
| ... | ... |
| 463432 | 224243 |
| 464148 | 63541 |
| 466584 | 445073 |
| 470997 | 170676 |
| 471099 | 61433 |
206 rows × 1 columns
df_outliers['tripduration'].min()
50608
df_trips = df_trips[(df_trips['tripduration']<df_outliers['tripduration'].min())]
df_trips_noZero_BthYears = df_trips[(df_trips['birth_year']>0)]
#checking row count without the null(0) birthdays
df_trips_noZero_BthYears.tripduration.count()
417468
df_trips.head(1)
| tripduration | starttime | stoptime | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bikeid | usertype | birth_year | gender | Year | Year_Month | Month | Day | Day_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2319 | 2016-03-09 13:08:21 | 2016-03-09 13:47:01 | 520 | W 52 St & 5 Ave | 40.759923 | -73.976485 | 363 | West Thames St | 40.708347 | -74.017134 | 23062 | Subscriber | 1972 | male | 2016 | 2016-Mar | March | 09 | Wednesday |
df_days = df_trips.groupby('Day_Name').size()
df_days = df_days/df_days.sum()*100
df_days = df_days.reset_index(name='Percentage')
df_days
| Day_Name | Percentage | |
|---|---|---|
| 0 | Friday | 14.842294 |
| 1 | Monday | 14.344143 |
| 2 | Saturday | 12.316468 |
| 3 | Sunday | 12.013098 |
| 4 | Thursday | 15.374670 |
| 5 | Tuesday | 15.336854 |
| 6 | Wednesday | 15.772473 |
colorscales = px.colors.named_colorscales()
fig = px.bar(df_days,
category_orders={"Day_Name": ["Monday", "Tuesday", "Wednesday", "Thursday",'Friday','Saturday','Sunday']},
text=[f'{x:.1f}%' for x in df_days.Percentage], # Convert to a list and format each element
orientation="v",
x = df_days.Day_Name,
y = df_days.Percentage,
#color = df_days.Percentage,
#color_discrete_sequence = ['chartreuse'],
#color_continuous_scale='picnic',
labels={
"Day_Name": "Day Of Week",
"Percentage": "Percentagem",
}
)
fig.update_layout(yaxis_range=[0,20],
showlegend=False,
title='% of Riders By Day of Week',
)
#colorscales = px.colors.named_colorscales() #List to check color scales/palettes
fig.show()
df_years = df_trips.groupby('Year').size()
df_years = df_years.reset_index(name='Count')
df_years
| Year | Count | |
|---|---|---|
| 0 | 2013 | 50697 |
| 1 | 2014 | 80520 |
| 2 | 2015 | 99174 |
| 3 | 2016 | 102310 |
| 4 | 2017 | 140649 |
fig = px.bar(df_years,
category_orders={"Year": ["2013", "2014", "2015", "2016",'2017']},
text=[f'{x}' for x in df_years.Count], # Convert to a list and format each element
orientation="v",
x = df_years.Year,
y = df_years.Count,
#color = df_days.Percentage,
#color_discrete_sequence = ['chartreuse'],
#color_continuous_scale='picnic',
labels={
"Year": "Year",
"Count": "Number of Riders",
}
)
fig.update_layout(
showlegend=False,
title='Number of Riders By Year',
)
#colorscales = px.colors.named_colorscales() #List to check color scales/palettes
fig.show()
df_YM = df_trips.groupby(['Year_Month','Day']).size()
df_YM = df_YM.reset_index(name='Count')
df_YM
| Year_Month | Day | Count | |
|---|---|---|---|
| 0 | 2013-Aug | 01 | 205 |
| 1 | 2013-Aug | 02 | 375 |
| 2 | 2013-Aug | 03 | 293 |
| 3 | 2013-Aug | 04 | 373 |
| 4 | 2013-Aug | 05 | 361 |
| ... | ... | ... | ... |
| 1454 | 2017-Sep | 26 | 677 |
| 1455 | 2017-Sep | 27 | 667 |
| 1456 | 2017-Sep | 28 | 715 |
| 1457 | 2017-Sep | 29 | 704 |
| 1458 | 2017-Sep | 30 | 480 |
1459 rows × 3 columns
# Define the function to update the plot
def update_plot(Year_Month):
filtered_df = df_YM[(df_YM['Year_Month']== Year_Month)]
fig = px.bar(filtered_df,
#category_orders={"Year": ["2013", "2014", "2015", "2016",'2017']},
#text=[f'{x}' for x in df_years.Count], # Convert to a list and format each element
orientation="v",
x = filtered_df.Day,
y = filtered_df.Count,
#color = df_days.Percentage,
#color_discrete_sequence = ['chartreuse'],
#color_continuous_scale='picnic',
labels={
"Day": "Day of Month",
"Count": "Number of Riders per day",
}
)
fig.update_layout(
showlegend=False,
title='Number of Riders per Day',
)
#colorscales = px.colors.named_colorscales() #List to check color scales/palettes
fig.show()
Column_Values_Year = df_YM.Year_Month.unique()
dropdown_Year=widgets.Dropdown(
options=Column_Values_Year,
value=Column_Values_Year[0],
description='Year:',
disabled=False,
)
# Link slider to update function
widgets.interact(update_plot, Year_Month = dropdown_Year)
interactive(children=(Dropdown(description='Year:', options=('2013-Aug', '2013-Dec', '2013-Jul', '2013-Nov', '…
<function __main__.update_plot(Year_Month)>
df_gender = df_trips[df_trips['gender']!='unknown']
df_gender = df_gender.groupby(['gender']).size()
df_gender = df_gender/df_gender.sum()*100
df_gender = df_gender.reset_index(name='Percentage')
df_gender
| gender | Percentage | |
|---|---|---|
| 0 | female | 24.203666 |
| 1 | male | 75.796334 |
fig = px.pie(df_gender,
values=df_gender.Percentage,
labels={
"gender": "Gender",
"Percentage": "Percentage",
},
names=df_gender.gender,
hole=.3)
fig.update_layout(showlegend=True,
title='% of man and wom',
#textposition='inside',
#textinfo='percent+label'
)
#colorscales = px.colors.named_colorscales() #List to check color scales/palettes
fig.show()
df_trips.head(1)
| tripduration | starttime | stoptime | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bikeid | usertype | birth_year | gender | Year | Year_Month | Month | Day | Day_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2319 | 2016-03-09 13:08:21 | 2016-03-09 13:47:01 | 520 | W 52 St & 5 Ave | 40.759923 | -73.976485 | 363 | West Thames St | 40.708347 | -74.017134 | 23062 | Subscriber | 1972 | male | 2016 | 2016-Mar | March | 09 | Wednesday |
start_station_name start_station_latitude start_station_longitude count (.size)
df_Start_Station = df_trips.groupby(['start_station_id','start_station_name','start_station_latitude','start_station_longitude']).size()
df_Start_Station = df_Start_Station.reset_index(name='Count')
df_Start_Station['start_station_name'].nunique()
df_Start_Station.loc[3] == df_Start_Station.loc[4]
df_Start_Station
| start_station_id | start_station_name | start_station_latitude | start_station_longitude | Count | |
|---|---|---|---|---|---|
| 0 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 1226 |
| 1 | 79 | Franklin St & W Broadway | 40.719116 | -74.006667 | 996 |
| 2 | 82 | St James Pl & Pearl St | 40.711174 | -74.000165 | 421 |
| 3 | 83 | Atlantic Ave & Fort Greene Pl | 40.683826 | -73.976323 | 537 |
| 4 | 116 | W 17 St & 8 Ave | 40.741776 | -74.001497 | 1957 |
| ... | ... | ... | ... | ... | ... |
| 892 | 3643 | E 41 St & 5 Ave | 40.752722 | -73.981236 | 73 |
| 893 | 3644 | Van Dyke St & Van Brunt St | 40.675816 | -74.014882 | 11 |
| 894 | 3647 | 48 Ave & 30 Pl | 40.741283 | -73.937259 | 5 |
| 895 | 3648 | Flushing Ave & Vanderbilt Ave | 40.697950 | -73.970776 | 3 |
| 896 | 3649 | W 129 St & Convent Ave | 40.814394 | -73.953247 | 3 |
897 rows × 5 columns
#px.set_mapbox_access_token(open(".mapbox_token").read())
fig = px.scatter_mapbox(df_Start_Station,
lat="start_station_latitude",
lon="start_station_longitude",
color="Count",
size="Count",
color_continuous_scale=px.colors.cyclical.IceFire,
size_max=15,
zoom=10,
center={'lat':40.7307, 'lon':-74.0060},
hover_name='start_station_name',
labels={
"start_station_name": "Station Name",
"start_station_latitude": "Latitude",
"start_station_longitude": "Longitude",
"Count": "Trip Count",
},)
fig.update_layout(mapbox_style="carto-positron")
fig.show()
df_Start_End_Station = df_trips.groupby(['start_station_id','start_station_name','start_station_latitude','start_station_longitude','end_station_latitude','end_station_longitude']).size()
df_Start_End_Station = df_Start_End_Station.reset_index(name='Trip_Count')
df_Start_End_Station = df_Start_End_Station[(df_Start_End_Station['start_station_id']==72)]
df_Start_End_Station
| start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_latitude | end_station_longitude | Trip_Count | |
|---|---|---|---|---|---|---|---|
| 0 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.683125 | -73.978951 | 1 |
| 1 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.685160 | -73.977114 | 1 |
| 2 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.690238 | -73.992031 | 1 |
| 3 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.692418 | -73.989495 | 1 |
| 4 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.694247 | -73.992159 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 264 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.798994 | -73.966217 | 1 |
| 265 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.799757 | -73.962113 | 1 |
| 266 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.801343 | -73.971146 | 4 |
| 267 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.804213 | -73.966991 | 2 |
| 268 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.812056 | -73.961255 | 1 |
269 rows × 7 columns
df_Start_End_Station = df_Start_End_Station.explode(['start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude', 'start_station_name'])
df_Start_End_Station
| start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_latitude | end_station_longitude | Trip_Count | |
|---|---|---|---|---|---|---|---|
| 0 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.683125 | -73.978951 | 1 |
| 1 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.685160 | -73.977114 | 1 |
| 2 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.690238 | -73.992031 | 1 |
| 3 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.692418 | -73.989495 | 1 |
| 4 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.694247 | -73.992159 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 264 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.798994 | -73.966217 | 1 |
| 265 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.799757 | -73.962113 | 1 |
| 266 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.801343 | -73.971146 | 4 |
| 267 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.804213 | -73.966991 | 2 |
| 268 | 72 | W 52 St & 11 Ave | 40.767272 | -73.993929 | 40.812056 | -73.961255 | 1 |
269 rows × 7 columns
# Create a line mapbox plot
fig = px.line_mapbox(df_Start_End_Station,
lat=['start_station_latitude', 'end_station_latitude'],
lon=['start_station_longitude', 'end_station_longitude'],
#hover_data=['start_station_latitude','end_station_longitude'],
#color='Trip_Count',
#line_width='Trip_Count',
zoom=11.3, center={'lat': 40.767272, 'lon': -73.993929},
mapbox_style='carto-positron')
fig.update_traces(#opacity=0.5,
line=dict(color='#195185'))
fig.show()
#fig.update_layout(mapbox_style="carto-positron")
#fig.show()